{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "65a66bf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run -i \"../util/file_utils.ipynb\"\n",
    "%run -i \"../util/lang_utils.ipynb\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "07ea7f66-5dd2-4a4f-b87f-0b006129cf00",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e623360d",
   "metadata": {},
   "source": [
    "# Part of speech tagging using spaCy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "431f63e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def pos_tag_spacy(text, model):\n",
    "    doc = model(text)\n",
    "    words = [token.text for token in doc]\n",
    "    pos = [token.pos_ for token in doc]\n",
    "    return list(zip(words, pos))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "6969d570",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('To', 'ADP'), ('Sherlock', 'PROPN'), ('Holmes', 'PROPN'), ('she', 'PRON'), ('is', 'AUX'), ('always', 'ADV'), ('_', 'PUNCT'), ('the', 'DET'), ('_', 'PROPN'), ('woman', 'NOUN'), ('.', 'PUNCT'), ('I', 'PRON'), ('have', 'AUX'), ('seldom', 'ADV'), ('heard', 'VERB'), ('him', 'PRON'), ('\\n', 'SPACE'), ('mention', 'VERB'), ('her', 'PRON'), ('under', 'ADP'), ('any', 'DET'), ('other', 'ADJ'), ('name', 'NOUN'), ('.', 'PUNCT'), ('In', 'ADP'), ('his', 'PRON'), ('eyes', 'NOUN'), ('she', 'PRON'), ('eclipses', 'VERB'), ('and', 'CCONJ'), ('\\n', 'SPACE'), ('predominates', 'VERB'), ('the', 'DET'), ('whole', 'NOUN'), ('of', 'ADP'), ('her', 'PRON'), ('sex', 'NOUN'), ('.', 'PUNCT'), ('It', 'PRON'), ('was', 'AUX'), ('not', 'PART'), ('that', 'SCONJ'), ('he', 'PRON'), ('felt', 'VERB'), ('any', 'DET'), ('emotion', 'NOUN'), ('\\n', 'SPACE'), ('akin', 'ADJ'), ('to', 'PART'), ('love', 'VERB'), ('for', 'ADP'), ('Irene', 'PROPN'), ('Adler', 'PROPN'), ('.', 'PUNCT'), ('All', 'DET'), ('emotions', 'NOUN'), (',', 'PUNCT'), ('and', 'CCONJ'), ('that', 'DET'), ('one', 'NUM'), ('particularly', 'ADV'), (',', 'PUNCT'), ('\\n', 'SPACE'), ('were', 'AUX'), ('abhorrent', 'ADJ'), ('to', 'ADP'), ('his', 'PRON'), ('cold', 'ADJ'), (',', 'PUNCT'), ('precise', 'ADJ'), ('but', 'CCONJ'), ('admirably', 'ADV'), ('balanced', 'ADJ'), ('mind', 'NOUN'), ('.', 'PUNCT'), ('He', 'PRON'), ('\\n', 'SPACE'), ('was', 'AUX'), (',', 'PUNCT'), ('I', 'PRON'), ('take', 'VERB'), ('it', 'PRON'), (',', 'PUNCT'), ('the', 'DET'), ('most', 'ADV'), ('perfect', 'ADJ'), ('reasoning', 'NOUN'), ('and', 'CCONJ'), ('observing', 'VERB'), ('machine', 'NOUN'), ('that', 'SCONJ'), ('\\n', 'SPACE'), ('the', 'DET'), ('world', 'NOUN'), ('has', 'AUX'), ('seen', 'VERB'), (',', 'PUNCT'), ('but', 'CCONJ'), ('as', 'ADP'), ('a', 'DET'), ('lover', 'NOUN'), ('he', 'PRON'), ('would', 'AUX'), ('have', 'AUX'), ('placed', 'VERB'), ('himself', 'PRON'), ('in', 'ADP'), ('a', 'DET'), ('\\n', 'SPACE'), ('false', 'ADJ'), ('position', 'NOUN'), ('.', 'PUNCT'), ('He', 'PRON'), ('never', 'ADV'), ('spoke', 'VERB'), ('of', 'ADP'), ('the', 'DET'), ('softer', 'ADJ'), ('passions', 'NOUN'), (',', 'PUNCT'), ('save', 'VERB'), ('with', 'ADP'), ('a', 'DET'), ('gibe', 'NOUN'), ('\\n', 'SPACE'), ('and', 'CCONJ'), ('a', 'DET'), ('sneer', 'NOUN'), ('.', 'PUNCT'), ('They', 'PRON'), ('were', 'AUX'), ('admirable', 'ADJ'), ('things', 'NOUN'), ('for', 'ADP'), ('the', 'DET'), ('observer', 'NOUN'), ('—', 'PUNCT'), ('excellent', 'ADJ'), ('for', 'ADP'), ('\\n', 'SPACE'), ('drawing', 'VERB'), ('the', 'DET'), ('veil', 'NOUN'), ('from', 'ADP'), ('men', 'NOUN'), ('’s', 'PART'), ('motives', 'NOUN'), ('and', 'CCONJ'), ('actions', 'NOUN'), ('.', 'PUNCT'), ('But', 'CCONJ'), ('for', 'SCONJ'), ('the', 'DET'), ('trained', 'VERB'), ('\\n', 'SPACE'), ('reasoner', 'NOUN'), ('to', 'PART'), ('admit', 'VERB'), ('such', 'ADJ'), ('intrusions', 'NOUN'), ('into', 'ADP'), ('his', 'PRON'), ('own', 'ADJ'), ('delicate', 'ADJ'), ('and', 'CCONJ'), ('finely', 'ADV'), ('\\n', 'SPACE'), ('adjusted', 'VERB'), ('temperament', 'NOUN'), ('was', 'AUX'), ('to', 'PART'), ('introduce', 'VERB'), ('a', 'DET'), ('distracting', 'VERB'), ('factor', 'NOUN'), ('which', 'PRON'), ('might', 'AUX'), ('\\n', 'SPACE'), ('throw', 'VERB'), ('a', 'DET'), ('doubt', 'NOUN'), ('upon', 'SCONJ'), ('all', 'DET'), ('his', 'PRON'), ('mental', 'ADJ'), ('results', 'NOUN'), ('.', 'PUNCT'), ('Grit', 'NOUN'), ('in', 'ADP'), ('a', 'DET'), ('sensitive', 'ADJ'), ('\\n', 'SPACE'), ('instrument', 'NOUN'), (',', 'PUNCT'), ('or', 'CCONJ'), ('a', 'DET'), ('crack', 'NOUN'), ('in', 'ADP'), ('one', 'NUM'), ('of', 'ADP'), ('his', 'PRON'), ('own', 'ADJ'), ('high', 'ADJ'), ('-', 'PUNCT'), ('power', 'NOUN'), ('lenses', 'NOUN'), (',', 'PUNCT'), ('would', 'AUX'), ('not', 'PART'), ('\\n', 'SPACE'), ('be', 'AUX'), ('more', 'ADV'), ('disturbing', 'ADJ'), ('than', 'ADP'), ('a', 'DET'), ('strong', 'ADJ'), ('emotion', 'NOUN'), ('in', 'ADP'), ('a', 'DET'), ('nature', 'NOUN'), ('such', 'ADJ'), ('as', 'ADP'), ('his', 'PRON'), ('.', 'PUNCT'), ('And', 'CCONJ'), ('\\n', 'SPACE'), ('yet', 'ADV'), ('there', 'PRON'), ('was', 'VERB'), ('but', 'CCONJ'), ('one', 'NUM'), ('woman', 'NOUN'), ('to', 'ADP'), ('him', 'PRON'), (',', 'PUNCT'), ('and', 'CCONJ'), ('that', 'DET'), ('woman', 'NOUN'), ('was', 'AUX'), ('the', 'DET'), ('late', 'ADJ'), ('Irene', 'PROPN'), ('\\n', 'SPACE'), ('Adler', 'PROPN'), (',', 'PUNCT'), ('of', 'ADP'), ('dubious', 'ADJ'), ('and', 'CCONJ'), ('questionable', 'ADJ'), ('memory', 'NOUN'), ('.', 'PUNCT')]\n"
     ]
    }
   ],
   "source": [
    "text = read_text_file(\"../data/sherlock_holmes_1.txt\")\n",
    "words_with_pos = pos_tag_spacy(text, small_model)\n",
    "print(words_with_pos)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "46efd3e4",
   "metadata": {},
   "source": [
    "# Part of speech tagging using NLTK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "134eaf61",
   "metadata": {},
   "outputs": [],
   "source": [
    "#nltk.download('averaged_perceptron_tagger') # Run the first time you run the notebook\n",
    "def pos_tag_nltk(text):\n",
    "    words = word_tokenize_nltk(text)\n",
    "    words_with_pos = nltk.pos_tag(words)\n",
    "    return words_with_pos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "941ade4c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('To', 'TO'), ('Sherlock', 'NNP'), ('Holmes', 'NNP'), ('she', 'PRP'), ('is', 'VBZ'), ('always', 'RB'), ('_the_', 'JJ'), ('woman', 'NN'), ('.', '.'), ('I', 'PRP'), ('have', 'VBP'), ('seldom', 'VBN'), ('heard', 'RB'), ('him', 'PRP'), ('mention', 'VB'), ('her', 'PRP'), ('under', 'IN'), ('any', 'DT'), ('other', 'JJ'), ('name', 'NN'), ('.', '.'), ('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN'), ('.', '.'), ('It', 'PRP'), ('was', 'VBD'), ('not', 'RB'), ('that', 'IN'), ('he', 'PRP'), ('felt', 'VBD'), ('any', 'DT'), ('emotion', 'NN'), ('akin', 'NN'), ('to', 'TO'), ('love', 'VB'), ('for', 'IN'), ('Irene', 'NNP'), ('Adler', 'NNP'), ('.', '.'), ('All', 'DT'), ('emotions', 'NNS'), (',', ','), ('and', 'CC'), ('that', 'IN'), ('one', 'CD'), ('particularly', 'RB'), (',', ','), ('were', 'VBD'), ('abhorrent', 'JJ'), ('to', 'TO'), ('his', 'PRP$'), ('cold', 'NN'), (',', ','), ('precise', 'NN'), ('but', 'CC'), ('admirably', 'RB'), ('balanced', 'VBD'), ('mind', 'NN'), ('.', '.'), ('He', 'PRP'), ('was', 'VBD'), (',', ','), ('I', 'PRP'), ('take', 'VBP'), ('it', 'PRP'), (',', ','), ('the', 'DT'), ('most', 'RBS'), ('perfect', 'JJ'), ('reasoning', 'NN'), ('and', 'CC'), ('observing', 'VBG'), ('machine', 'NN'), ('that', 'IN'), ('the', 'DT'), ('world', 'NN'), ('has', 'VBZ'), ('seen', 'VBN'), (',', ','), ('but', 'CC'), ('as', 'IN'), ('a', 'DT'), ('lover', 'NN'), ('he', 'PRP'), ('would', 'MD'), ('have', 'VB'), ('placed', 'VBN'), ('himself', 'PRP'), ('in', 'IN'), ('a', 'DT'), ('false', 'JJ'), ('position', 'NN'), ('.', '.'), ('He', 'PRP'), ('never', 'RB'), ('spoke', 'VBD'), ('of', 'IN'), ('the', 'DT'), ('softer', 'JJR'), ('passions', 'NNS'), (',', ','), ('save', 'VBP'), ('with', 'IN'), ('a', 'DT'), ('gibe', 'NN'), ('and', 'CC'), ('a', 'DT'), ('sneer', 'NN'), ('.', '.'), ('They', 'PRP'), ('were', 'VBD'), ('admirable', 'JJ'), ('things', 'NNS'), ('for', 'IN'), ('the', 'DT'), ('observer—excellent', 'NN'), ('for', 'IN'), ('drawing', 'VBG'), ('the', 'DT'), ('veil', 'NN'), ('from', 'IN'), ('men', 'NNS'), ('’', 'VBP'), ('s', 'JJ'), ('motives', 'NNS'), ('and', 'CC'), ('actions', 'NNS'), ('.', '.'), ('But', 'CC'), ('for', 'IN'), ('the', 'DT'), ('trained', 'JJ'), ('reasoner', 'NN'), ('to', 'TO'), ('admit', 'VB'), ('such', 'JJ'), ('intrusions', 'NNS'), ('into', 'IN'), ('his', 'PRP$'), ('own', 'JJ'), ('delicate', 'NN'), ('and', 'CC'), ('finely', 'RB'), ('adjusted', 'VBD'), ('temperament', 'NN'), ('was', 'VBD'), ('to', 'TO'), ('introduce', 'VB'), ('a', 'DT'), ('distracting', 'NN'), ('factor', 'NN'), ('which', 'WDT'), ('might', 'MD'), ('throw', 'VB'), ('a', 'DT'), ('doubt', 'NN'), ('upon', 'IN'), ('all', 'PDT'), ('his', 'PRP$'), ('mental', 'JJ'), ('results', 'NNS'), ('.', '.'), ('Grit', 'NNP'), ('in', 'IN'), ('a', 'DT'), ('sensitive', 'JJ'), ('instrument', 'NN'), (',', ','), ('or', 'CC'), ('a', 'DT'), ('crack', 'NN'), ('in', 'IN'), ('one', 'CD'), ('of', 'IN'), ('his', 'PRP$'), ('own', 'JJ'), ('high-power', 'NN'), ('lenses', 'NNS'), (',', ','), ('would', 'MD'), ('not', 'RB'), ('be', 'VB'), ('more', 'RBR'), ('disturbing', 'JJ'), ('than', 'IN'), ('a', 'DT'), ('strong', 'JJ'), ('emotion', 'NN'), ('in', 'IN'), ('a', 'DT'), ('nature', 'NN'), ('such', 'JJ'), ('as', 'IN'), ('his', 'PRP$'), ('.', '.'), ('And', 'CC'), ('yet', 'RB'), ('there', 'EX'), ('was', 'VBD'), ('but', 'CC'), ('one', 'CD'), ('woman', 'NN'), ('to', 'TO'), ('him', 'PRP'), (',', ','), ('and', 'CC'), ('that', 'IN'), ('woman', 'NN'), ('was', 'VBD'), ('the', 'DT'), ('late', 'JJ'), ('Irene', 'NNP'), ('Adler', 'NNP'), (',', ','), ('of', 'IN'), ('dubious', 'JJ'), ('and', 'CC'), ('questionable', 'JJ'), ('memory', 'NN'), ('.', '.')]\n"
     ]
    }
   ],
   "source": [
    "words_with_pos = pos_tag_nltk(text)\n",
    "print(words_with_pos)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8d80765",
   "metadata": {},
   "source": [
    "# Compare running times for NLTK and spaCy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "2b6eabcf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NLTK: 0.006206035614013672 s\n",
      "spaCy: 0.02251291275024414 s\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "start = time.time()\n",
    "pos_tag_nltk(text)\n",
    "print(f\"NLTK: {time.time() - start} s\")\n",
    "\n",
    "start = time.time()\n",
    "pos_tag_spacy(text, small_model)\n",
    "print(f\"spaCy: {time.time() - start} s\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56064520",
   "metadata": {},
   "source": [
    "# Get part of speech tags using GPT-3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "6d9fc4b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "client = OpenAI(api_key=OPEN_AI_KEY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "cbd9bca9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Here are the part of speech tags for the sentence \"In his eyes she eclipses and predominates the whole of her sex\" in the format of a Python tuple:\n",
      "\n",
      "[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'JJ'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN')]\n"
     ]
    }
   ],
   "source": [
    "prompt=\"\"\"Decide what the part of speech tags are for a sentence. \n",
    "Preserve original capitalization. \n",
    "Return the list in the format of a python tuple: (word, part of speech). \n",
    "Sentence: In his eyes she eclipses and predominates the whole of her sex.\"\"\"\n",
    "response = client.chat.completions.create(\n",
    "    model=\"gpt-3.5-turbo\",\n",
    "    temperature=0,\n",
    "    max_tokens=256,\n",
    "    top_p=1.0,\n",
    "    frequency_penalty=0,\n",
    "    presence_penalty=0,\n",
    "    messages=[\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "        {\"role\": \"user\", \"content\": prompt}\n",
    "    ], \n",
    ")\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "82e1c838-16f9-403c-b92e-42dba731f940",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ChatCompletion(id='chatcmpl-9hCq34UAzMiNiqNGopt2U8ZmZM5po', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are the part of speech tags for the sentence \"In his eyes she eclipses and predominates the whole of her sex\" in the format of a Python tuple:\\n\\n[(\\'In\\', \\'IN\\'), (\\'his\\', \\'PRP$\\'), (\\'eyes\\', \\'NNS\\'), (\\'she\\', \\'PRP\\'), (\\'eclipses\\', \\'VBZ\\'), (\\'and\\', \\'CC\\'), (\\'predominates\\', \\'VBZ\\'), (\\'the\\', \\'DT\\'), (\\'whole\\', \\'JJ\\'), (\\'of\\', \\'IN\\'), (\\'her\\', \\'PRP$\\'), (\\'sex\\', \\'NN\\')]', role='assistant', function_call=None, tool_calls=None))], created=1720084483, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=120, prompt_tokens=74, total_tokens=194))\n"
     ]
    }
   ],
   "source": [
    "print(response.choices[0].message.content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "aa82c73a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from ast import literal_eval\n",
    "\n",
    "def pos_tag_gpt(text, client):\n",
    "    prompt = f\"\"\"Decide what the part of speech tags are for a sentence. \n",
    "    Preserve original capitalization. \n",
    "    Return the list in the format of a python tuple: (word, part of speech).\n",
    "    Do not include any other explanations.\n",
    "    Sentence: {text}.\"\"\"\n",
    "\n",
    "    response = client.chat.completions.create(\n",
    "        model=\"gpt-3.5-turbo\",\n",
    "        temperature=0,\n",
    "        max_tokens=256,\n",
    "        top_p=1.0,\n",
    "        frequency_penalty=0,\n",
    "        presence_penalty=0,\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "            {\"role\": \"user\", \"content\": prompt}\n",
    "        ],    \n",
    "    )\n",
    "    result = response.choices[0].message.content\n",
    "    result = result.replace(\"\\n\", \"\")\n",
    "    result = list(literal_eval(result))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "a6d4c076",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'JJ'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN')]\n",
      "GPT: 2.4942469596862793 s\n"
     ]
    }
   ],
   "source": [
    "start = time.time()\n",
    "first_sentence = \"In his eyes she eclipses and predominates the whole of her sex.\"\n",
    "words_with_pos = pos_tag_gpt(first_sentence, client)\n",
    "print(words_with_pos)\n",
    "print(f\"GPT: {time.time() - start} s\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "fd9a599d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN'), ('.', '.')]\n"
     ]
    }
   ],
   "source": [
    "words_with_pos_nltk = pos_tag_nltk(first_sentence)\n",
    "print(words_with_pos_nltk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "39e321cf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False\n"
     ]
    }
   ],
   "source": [
    "print(words_with_pos == words_with_pos_nltk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "20ef6931-f9d1-465c-80ae-409793a1a065",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "401.9066461774875\n"
     ]
    }
   ],
   "source": [
    "print(2.4942469596862793/0.006206035614013672)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a89f14f-fff8-46de-8e16-f3618c1784da",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
